library(tidyverse)
library(janitor)
library(leaflet)
library(readxl)
library(easystats)
# Read the raw sightings file, normalise the column names with janitor, and
# print a structural overview of the resulting tibble.
df <- janitor::clean_names(read_csv("./ufo_data.csv"))
glimpse(df)
## Rows: 80,332
## Columns: 11
## $ datetime <chr> "10/10/1949 20:30", "10/10/1949 21:00", "10/10/1955…
## $ city <chr> "san marcos", "lackland afb", "chester (uk/england)…
## $ state <chr> "tx", "tx", NA, "tx", "hi", "tn", NA, "ct", "al", "…
## $ country <chr> "us", NA, "gb", "us", "us", "us", "gb", "us", "us",…
## $ shape <chr> "cylinder", "light", "circle", "circle", "light", "…
## $ duration_seconds <dbl> 2700, 7200, 20, 20, 900, 300, 180, 1200, 180, 120, …
## $ duration_hours_min <chr> "45 minutes", "1-2 hrs", "20 seconds", "1/2 hour", …
## $ comments <chr> "This event took place in early fall around 1949-50…
## $ date_posted <chr> "4/27/2004", "12/16/2005", "1/21/2008", "1/17/2004"…
## $ latitude <dbl> 29.88306, 29.38421, 53.20000, 28.97833, 21.41806, 3…
## $ longitude <dbl> -97.941111, -98.581082, -2.916667, -96.645833, -157…
As we can see above, there is some cleaning to be done before we get started on analyzing any data.
This analysis is only going to focus on UFO sightings within the United States, so I need the data to reflect that.
# Keep only the rows whose `state` is one of the 50 US state codes or "dc"
# (lower case, as they appear in the data), then label every surviving row
# as "us". Rows with a missing or unlisted state are dropped.
df <- df %>%
  filter(state %in% c(tolower(state.abb), "dc")) %>%
  mutate(country = "us")
One would expect the above code to filter the submissions adequately, but the latitudes and longitudes show that some entries still fall outside of the United States. To fix that, I’m going to define bounding boxes and keep only the sightings that fall inside them.
# Bounding boxes in decimal degrees (x = longitude, y = latitude). A sighting
# is kept only if its coordinates fall inside one of the two boxes.
continental_us <- list(
  xmin = -125.0,     # Westernmost point
  xmax = -66.93457,  # Easternmost point
  ymin = 24.396308,  # Southernmost point
  ymax = 49.384358   # Northernmost point
)
alaska_hawaii <- list(
  xmin = -178.2166,  # Westernmost point
  xmax = -129.9943,  # Easternmost point
  ymin = 18.9117,    # Southernmost point
  ymax = 71.5388     # Northernmost point
)

# `between()` is inclusive on both ends; rows with a missing coordinate are
# dropped by `filter()`.
continental_us_data <- df %>%
  filter(
    between(latitude, continental_us$ymin, continental_us$ymax),
    between(longitude, continental_us$xmin, continental_us$xmax)
  )
alaska_hawaii_data <- df %>%
  filter(
    between(latitude, alaska_hawaii$ymin, alaska_hawaii$ymax),
    between(longitude, alaska_hawaii$xmin, alaska_hawaii$xmax)
  )

# Stack the two subsets: continental rows first, then Alaska/Hawaii rows.
df <- rbind(continental_us_data, alaska_hawaii_data)
Another problem with this data is how the shapes of the UFOs are recorded.
# List the distinct shape labels present before any consolidation.
df %>%
  pull(shape) %>%
  unique()
## [1] "cylinder" "light" "circle" "sphere" "disk" "fireball"
## [7] "unknown" "oval" "other" "rectangle" "chevron" "formation"
## [13] "triangle" "cigar" NA "delta" "changing" "diamond"
## [19] "flash" "egg" "teardrop" "cone" "cross" "pyramid"
## [25] "round" "crescent" "flare" "hexagon" "dome" "changed"
There are a lot of values that could mean the same thing, so we’re going to group those together.
# Broader shape categories: each name is the target category and each vector
# holds the raw labels folded into it.
shape_groups <- list(
  light       = c("light", "fireball", "flash", "flare"),
  spherical   = c("circle", "sphere", "egg", "oval", "disk", "round"),
  rectangular = c("cylinder", "rectangle", "cigar"),
  triangular  = c("triangle", "pyramid"),
  other       = c("unknown", "other", "changing", "changed", "formation"),
  delta       = c("delta", "chevron"),
  diamond     = c("diamond", "hexagon"),
  cone        = c("cone", "dome")
)

# Flatten the groups into a named vector: raw label -> category.
shape_lookup <- setNames(
  rep(names(shape_groups), lengths(shape_groups)),
  unlist(shape_groups, use.names = FALSE)
)

df <- df %>%
  mutate(
    shape = as.character(shape),
    # Labels without a lookup entry keep their original value.
    shape = coalesce(unname(shape_lookup[shape]), shape),
    # Missing shapes are counted as "other".
    shape = replace_na(shape, "other")
  )

# Confirm which categories remain after consolidation.
unique(df$shape)
## [1] "rectangular" "light" "spherical" "other" "delta"
## [6] "triangular" "diamond" "teardrop" "cone" "cross"
## [11] "crescent"
First, let’s put up a map of all the UFO sightings in the United States: